import re
import json
import random
import numpy as np
import pandas as pd
from glob import glob
from textwrap import wrap
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from pymagnitude import *
from tqdm import tqdm
%matplotlib inline
import warnings
# Global plotting / NLP setup shared by every cell below.
rcParams['figure.figsize'] = 16, 5  # default figure size for all plots
warnings.filterwarnings("ignore",category=DeprecationWarning)
# English stop-word set; extended later with corpus-specific terms.
stop_words = set(stopwords.words('english'))
# Matches inline/display LaTeX spans such as $...$ or $$...$$ (not used in this chunk).
latex_regex = r"(\$+)(?:(?!\1)[\s\S])*\1"
lmtzr = WordNetLemmatizer()  # WordNet lemmatizer applied to title tokens
def color_func(**kwargs):
    """Return a red HSL color string with a random saturation (0-50%)."""
    return "hsl(0, %d%%, 50%%)" % random.randint(0, 50)
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a grey HSL color string with a random lightness (60-100%)."""
    return f"hsl(0, 0%, {random.randint(60, 100)}%)"
C:\Users\simon\anaconda3\envs\py388\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning. warnings.warn(msg)
# Build a venue-id -> human-readable conference-name lookup table.
with open('../generated_material/Conference_Id_name.json',encoding='utf-8-sig') as f:
    conference_data = json.load(f)

conference_data_dict = {
    entry['Id_Venue']: entry['conference name'] for entry in conference_data
}
# Load the first 950 rows of every per-community CSV into one DataFrame.
repr_df_files = glob("*.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# collect the frames and concatenate once (also avoids quadratic copying).
frames = [pd.read_csv(repr_df_file)[:950] for repr_df_file in repr_df_files]
# Keep the original per-file indices (0..949 repeating), as append did.
df = pd.concat(frames) if frames else pd.DataFrame()
df.head()
| ID_Article | communityId | ID_RelatedVenue | title | year | abstract | |
|---|---|---|---|---|---|---|
| 0 | 3052207 | 20178 | 369 | Green Cellular Demand Control with User-in-the... | 2016 | Mobile communications demand for data grows al... |
| 1 | 2847777 | 20178 | 369 | Synthesis of Radiation Patterns in Arbitrary G... | 2016 | A novel algorithm is presented for determining... |
| 2 | 3007663 | 20178 | 369 | R-OFDM Transmission Scheme for Visible Light C... | 2016 | White light-emitting diode (LED) consisting of... |
| 3 | 2875774 | 20178 | 369 | A Study on Channel Modeling in Tunnel Scenario... | 2016 | A new approach based on conventional propagati... |
| 4 | 3107243 | 20178 | 369 | Indoor Channel Measurements Using a 28GHz mult... | 2016 | Millimeter-wave (mmW) wireless is as a promisi... |
# How many articles belong to each community?
df.communityId.value_counts()
To see which years account for the majority of the published articles:
# Yearly distribution of publications across the whole corpus.
year_counts = df[['year']].sort_values(by='year').value_counts(sort=False)
ax = year_counts.plot(
    figsize=(16, 5),
    title='Distribution of articles against the year it was presented in',
)
ax.set_xlabel('Publication Year')
ax.set_ylabel('No. of articles published')
plt.legend(["Published article"])
plt.show()
Wie erwartet gibt es von Jahr zu Jahr mehr Veröffentlichungen. Im Jahr 2016 fanden auffällig viele Veröffentlichungen statt.
# One line per community: how its publication volume evolves over time.
for community in df.communityId.unique():
    yearly = (
        df[df['communityId'] == community]
        .sort_values(by='year')['year']
        .value_counts(sort=False)
    )
    plt.plot(yearly.index, yearly.values, label=community)
plt.title('Distribution of articles of the different communities against the year it was presented in')
plt.legend()
plt.show()
# List every venue that contributes more than 20 articles to the sample.
frequent = df.groupby("ID_RelatedVenue").filter(lambda g: len(g) > 20)
for venue_id in frequent['ID_RelatedVenue'].unique():
    print(f"{venue_id}: {conference_data_dict[venue_id]}")
369: vehicular technology conference 8228: international conference on communications 11470: international conference on multimedia and expo 8494: international symposium on circuits and systems 10228: wireless communications and networking conference 9078: international conference on image processing 30: international conference of the ieee engineering in medicine and biology society 21106: international conference on computer vision 8806: acm symposium on applied computing 23735: intelligent robots and systems 9475: conference on decision and control 9099: acm multimedia 11321: international conference on machine learning 422: knowledge discovery and data mining 256: international conference on learning representations 390: international symposium on biomedical imaging 8960: neural information processing systems 20358: international world wide web conferences 9616: international conference on pattern recognition 11166: international conference on data mining 11104: international geoscience and remote sensing symposium 20332: national conference on artificial intelligence 9463: north american chapter of the association for computational linguistics 8502: workshop on applications of computer vision 9804: conference of the international speech communication association 535: ieee automatic speech recognition and understanding workshop 65: robot and human interactive communication 8441: american control conference 20561: hawaii international conference on system sciences 9896: conference on computer supported cooperative work
# Venue id -> common conference abbreviation (insertion order preserved).
conf_abbvs = {
    11104: 'IGARSS',
    535: 'ASRU',
    256: 'ICLR',
    9463: 'NAACL',
    390: 'ISBI',
    8960: 'NIPS',
    8441: 'ACC',
    65: 'RO-MAN',
    11166: 'ICDM',
    9804: 'ISCA',
    422: 'KDD',
    20332: 'AAAI',
    20561: 'HICSS',
    9896: 'CSCW',
    20358: 'WWW',
    11321: 'ICML',
    9475: 'CDC',
    8806: 'SAC',
    21106: 'ICCV',
    30: 'EMBC',
    11470: 'ICME',
    10228: 'WCNC',
    8494: 'ISCAS',
    369: 'VTC',
    23735: 'IROS',
    8502: 'WACV',
    9616: 'ICPR',
    8228: 'ICC',
    9099: 'ACM',
    9078: 'ICIP',
}
# Cross-tabulate frequent venues against communities and plot as grouped bars.
plot_df = df.groupby("ID_RelatedVenue").filter(lambda g: len(g) > 20)
# Readable axis labels: replace numeric venue ids with their abbreviations
# (__getitem__ keeps the original KeyError behavior for unknown ids).
plot_df['ID_RelatedVenue'] = plot_df['ID_RelatedVenue'].apply(conf_abbvs.__getitem__)
results = pd.crosstab(plot_df['ID_RelatedVenue'], plot_df['communityId'])
results.plot.bar()
plt.show()
Man sieht, dass die Artikel jeder Community hauptsächlich in einem Venue präsentiert werden.
Um die Artikel communityübergreifend zu analysieren, werden jetzt nur die Titel der Artikel genutzt. Man könnte auch die Abstracts nehmen, aber das macht keinen großen Unterschied, wie man in den einzelnen Analysen bereits gesehen hat. Das spart Speicher und macht die Analyse auch einfacher.
plot_df = pd.DataFrame()
# Extend the NLTK stop words with corpus-specific boilerplate terms.
# ('A'/'The' are redundant — titles are lower-cased before filtering —
# but kept because the global set is reused by the word-cloud cells.)
stop_words.update(['based', 'using', 'A', 'The', 'system', 'algorithm'])

def _preprocess_title(title):
    """Clean, lower-case, tokenize, stop-word-filter and lemmatize one title.

    Same pipeline as before (regex clean -> lower -> tokenize -> drop stop
    words -> lemmatize), but in a single pass instead of five separate
    row-wise DataFrame.apply sweeps.
    """
    cleaned = re.sub("[^A-Za-z0-9' ]+", ' ', title).lower()
    # Stop-word test happens on the raw token, lemmatization afterwards —
    # matching the original order of operations.
    return [lmtzr.lemmatize(w) for w in word_tokenize(cleaned) if w not in stop_words]

plot_df['tokenized_col'] = df['title'].map(_preprocess_title)
plot_df['col'] = plot_df['tokenized_col'].map(' '.join)
plot_df['communityId'] = df['communityId']
plot_df
| tokenized_col | col | communityId | |
|---|---|---|---|
| 0 | [green, cellular, demand, control, user, loop,... | green cellular demand control user loop enable... | 20178 |
| 1 | [synthesis, radiation, pattern, arbitrary, geo... | synthesis radiation pattern arbitrary geometry... | 20178 |
| 2 | [r, ofdm, transmission, scheme, visible, light... | r ofdm transmission scheme visible light commu... | 20178 |
| 3 | [study, channel, modeling, tunnel, scenario, p... | study channel modeling tunnel scenario propaga... | 20178 |
| 4 | [indoor, channel, measurement, 28ghz, multi, b... | indoor channel measurement 28ghz multi beam mi... | 20178 |
| ... | ... | ... | ... |
| 945 | [pano, presence, teleoperation] | pano presence teleoperation | 22736 |
| 946 | [design, evaluation, rapid, programming, servi... | design evaluation rapid programming service robot | 22736 |
| 947 | [optimization, design, novel, hybrid, aerial, ... | optimization design novel hybrid aerial ground... | 22736 |
| 948 | [automatic, synthesis, communication, coordina... | automatic synthesis communication coordinated ... | 22736 |
| 949 | [multi, robot, exploration, unknown, environme... | multi robot exploration unknown environment id... | 22736 |
8818 rows × 3 columns
Word cloud for each community, to see the most common terms and words that appear in the titles of the articles and to get an idea of what the topics of the community are.
# Function for generating word clouds
def generate_wordcloud(data, communityId):
    """Render a word cloud of the (preprocessed) titles of one community.

    Parameters
    ----------
    data : iterable of str
        Preprocessed title strings for one community.
    communityId : hashable
        Community label; used only in the plot title.
    """
    # Bug fixes vs. the original:
    #  * pass `stop_words` itself — `set.update([])` returns None, so the
    #    original effectively passed stopwords=None;
    #  * use the `data` parameter — the original read the global `word_data`.
    wc = WordCloud(background_color="black", max_words=2000, stopwords=stop_words, width=800, height=600)
    wc.generate(" ".join(data))
    wc.recolor(color_func=color_func, random_state=17)
    plt.figure(figsize=(15,8))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title('\n'.join(wrap('CommunityId: ' + str(communityId),60)),fontsize=13)
    plt.show()
# Plotting word cloud for each communityId
for communityId in plot_df['communityId'].unique():
    # NOTE: the variable must stay named `word_data` — generate_wordcloud
    # as written reads this global instead of its parameter.
    word_data = plot_df.loc[plot_df['communityId'] == communityId, 'col'].values
    generate_wordcloud(word_data, communityId)
# Magnitude File: http://magnitude.plasticity.ai/fasttext/medium/wiki-news-300d-1M-subword.magnitude
# Memory-mapped fastText word embeddings (300-d, subword-aware) used below to
# embed title tokens; download the file to vectors/ before running this cell.
fasttext = Magnitude("vectors/wiki-news-300d-1M-subword.magnitude")
# Fit one TF-IDF model per community so every community gets its own
# word -> IDF-weight dictionary (used to weight the fastText vectors below).
idf_dicts = dict()
for comm in df['communityId'].unique():
    tfidf = TfidfVectorizer()
    # Boolean mask from df is valid on plot_df because both share df's index.
    tfidf.fit(plot_df[df['communityId'] == comm]['col'].values)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on old versions.
    try:
        feature_names = tfidf.get_feature_names_out()
    except AttributeError:
        feature_names = tfidf.get_feature_names()
    idf_dicts[comm] = dict(zip(feature_names, tfidf.idf_))
def tfidf_fasttext(df):
    """Embed each title as the IDF-weighted average of its fastText vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide `title` and `communityId` columns; `communityId`
        selects which community-specific IDF dictionary weights the tokens.

    Returns
    -------
    numpy.ndarray of shape (len(df), embedding_dim)
    """
    vectors = []
    for title, comm in tqdm(zip(df.title.values, df.communityId.values)):
        # Tokenize once per title (the original tokenized every title twice).
        tokens = word_tokenize(title)
        fasttext_vectors = fasttext.query(tokens)
        # Tokens missing from the community vocabulary get a neutral weight.
        # NOTE(review): titles are not lower-cased/lemmatized here, while the
        # IDF vocabulary was built from the preprocessed 'col' column — many
        # tokens will therefore take the fallback weight of 1; confirm intended.
        weights = [idf_dicts[comm].get(word, 1) for word in tokens]
        vectors.append(np.average(fasttext_vectors, axis = 0, weights = weights))
    return np.array(vectors)
vectors = tfidf_fasttext(df)
8818it [02:43, 53.85it/s]
# Temporarily enlarge the canvas for the scatter plot.
rcParams['figure.figsize'] = 48, 15
# Project the 300-d title embeddings down to 2-D for visual inspection.
tsne = TSNE(n_components=2, n_jobs=-1, verbose=0, perplexity=30)
projected = tsne.fit_transform(vectors)
tsne_data = pd.DataFrame(projected, columns=['X', 'Y'])
tsne_data['communityId'] = df['communityId'].values
sns.scatterplot(x='X', y='Y', hue='communityId',
                palette=sns.color_palette("hls", 10), data=tsne_data)
plt.title('TSNE on IDF-Fasttext Title Embeddings')
plt.show()
# Restore the default figure size.
rcParams['figure.figsize'] = 16, 5
Die Title Embeddings variieren über den ganzen 2D-Raum. Aber man sieht, dass die einzelnen Communities an manchen Stellen auch eng beieinander liegen. 20178 ist auffällig separiert von den anderen Gruppen (es wäre interessant, das weiter zu untersuchen). Aus dieser Perspektive kristallisieren sich drei übergeordnete Communities heraus: eine sehr große oben, eine mittlere unten und die Gruppe 20178. Was haben diese Gruppierungen gemeinsam? Sind sie vielleicht auch in ähnliche Themengebiete einteilbar?